Data Preparation¶
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan, normal_ad
from scipy import stats
# reproducible random for assigning random floors
RANDOM_SEED = 42
# NOTE(review): rng is never used in this chunk -- confirm it is needed
# later, otherwise it (and the seed) can be dropped.
rng = np.random.default_rng(RANDOM_SEED)
# Load the cleaned transaction-level dataset (4665 rows, see info() below).
df = pd.read_csv('clean_variable_all.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4665 entries, 0 to 4664 Data columns (total 58 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Area (in Sq. ft) 4665 non-null float64 1 Amount 4665 non-null float64 2 Market Value 4665 non-null float64 3 City 4665 non-null object 4 Project Name 4665 non-null object 5 Location 4665 non-null object 6 Latitude 4665 non-null float64 7 Longitude 4665 non-null float64 8 Sale Type 4665 non-null object 9 Registration Date 4665 non-null object 10 Quaters 4665 non-null object 11 YR 4665 non-null int64 12 Amount in CR 4665 non-null float64 13 Price Range 4665 non-null object 14 Project Name main 2306 non-null object 15 Rate (Rs/sft) 4665 non-null object 16 Unit 4301 non-null object 17 Floor 4506 non-null object 18 Tower 962 non-null object 19 Wing 1438 non-null object 20 Party 1 4665 non-null object 21 Party 2 4665 non-null object 22 Count 4665 non-null object 23 bus_stop_count_1km 4665 non-null int64 24 financial_facilities_1km 4665 non-null int64 25 medical_facilities_1km 4665 non-null int64 26 hospital_facilities_1km 4665 non-null int64 27 distance_to_metro 4665 non-null float64 28 distance_to_railway 4665 non-null float64 29 school_count_1km 4665 non-null int64 30 kindergarten_count_1km 4665 non-null int64 31 distance_to_supermarket 4665 non-null float64 32 distance_to_techhub 4665 non-null float64 33 distance_to_park 4665 non-null float64 34 distance_to_waterbody 4665 non-null float64 35 park_name 750 non-null object 36 park_area_m2 4665 non-null float64 37 park_lat 4665 non-null float64 38 park_lon 4665 non-null float64 39 waterbody_name 3080 non-null object 40 waterbody_waterbody_type_clean 4665 non-null object 41 waterbody_area_m2 4665 non-null float64 42 waterbody_lat 4665 non-null float64 43 waterbody_lon 4665 non-null float64 44 Floor_clean 4665 non-null int64 45 SaleType_encoded 4665 non-null int64 46 park_leisure_garden 4665 non-null bool 47 park_leisure_park 4665 non-null bool 48 
park_leisure_pitch 4665 non-null bool 49 park_leisure_playground 4665 non-null bool 50 park_leisure_recreation_common 4665 non-null bool 51 waterbody_type_lake 4665 non-null bool 52 waterbody_type_pond 4665 non-null bool 53 waterbody_type_reservoir 4665 non-null bool 54 waterbody_type_river 4665 non-null bool 55 log_amount 4665 non-null float64 56 area_raw 4665 non-null float64 57 log_area 4665 non-null float64 dtypes: bool(9), float64(21), int64(9), object(19) memory usage: 1.8+ MB
# ---- Feature engineering -------------------------------------------------
# Log of Amount (dependent variable).  np.log requires strictly positive
# values; the info() dump shows no nulls, but a zero/negative Amount would
# silently become -inf/NaN here.
df['log_amount'] = np.log(df['Amount'])
# Option A: keep Area as raw
df['area_raw'] = df['Area (in Sq. ft)']
# Option B: also create log-area (to try log-log model later)
df['log_area'] = np.log(df['Area (in Sq. ft)'])

# Choose predictors.  .copy() makes X an independent DataFrame rather than
# a view of df -- the original slice-without-copy triggered pandas'
# SettingWithCopyWarning on the bool->int assignment below.
X = df[['log_area', 'Floor_clean', 'SaleType_encoded',
        'bus_stop_count_1km', 'financial_facilities_1km',
        'medical_facilities_1km',
        'distance_to_metro', 'distance_to_railway', 'school_count_1km',
        'kindergarten_count_1km', 'distance_to_supermarket',
        'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
        'park_area_m2', 'waterbody_area_m2',
        'park_leisure_garden', 'park_leisure_park',
        'park_leisure_pitch', 'park_leisure_playground',
        'park_leisure_recreation_common',
        'waterbody_type_lake',
        'waterbody_type_pond', 'waterbody_type_reservoir',
        'waterbody_type_river']].copy()

# Find all boolean columns
bool_cols = X.select_dtypes(include=['bool']).columns
print(bool_cols)

# Convert boolean dummies to 0/1 integers so the regression libraries treat
# them as numeric regressors.  Safe now because X owns its data.
X[bool_cols] = X[bool_cols].astype(int)
Index(['park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
'park_leisure_playground', 'park_leisure_recreation_common',
'waterbody_type_lake', 'waterbody_type_pond',
'waterbody_type_reservoir', 'waterbody_type_river'],
dtype='object')
C:\Users\Sayan\AppData\Local\Temp\ipykernel_20080\2290739045.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X[bool_cols] = X[bool_cols].astype(int)
# Confirm the bool->int conversion: all 25 predictor columns should now be numeric
X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4665 entries, 0 to 4664 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 log_area 4665 non-null float64 1 Floor_clean 4665 non-null int64 2 SaleType_encoded 4665 non-null int64 3 bus_stop_count_1km 4665 non-null int64 4 financial_facilities_1km 4665 non-null int64 5 medical_facilities_1km 4665 non-null int64 6 distance_to_metro 4665 non-null float64 7 distance_to_railway 4665 non-null float64 8 school_count_1km 4665 non-null int64 9 kindergarten_count_1km 4665 non-null int64 10 distance_to_supermarket 4665 non-null float64 11 distance_to_techhub 4665 non-null float64 12 distance_to_park 4665 non-null float64 13 distance_to_waterbody 4665 non-null float64 14 park_area_m2 4665 non-null float64 15 waterbody_area_m2 4665 non-null float64 16 park_leisure_garden 4665 non-null int32 17 park_leisure_park 4665 non-null int32 18 park_leisure_pitch 4665 non-null int32 19 park_leisure_playground 4665 non-null int32 20 park_leisure_recreation_common 4665 non-null int32 21 waterbody_type_lake 4665 non-null int32 22 waterbody_type_pond 4665 non-null int32 23 waterbody_type_reservoir 4665 non-null int32 24 waterbody_type_river 4665 non-null int32 dtypes: float64(9), int32(9), int64(7) memory usage: 747.3 KB
# Reduced predictor matrix: keep only the continuous/count predictors and
# drop the park-leisure and waterbody-type dummy columns.
_core_predictors = [
    'log_area', 'Floor_clean', 'SaleType_encoded',
    'bus_stop_count_1km', 'financial_facilities_1km',
    'medical_facilities_1km',
    'distance_to_metro', 'distance_to_railway', 'school_count_1km',
    'kindergarten_count_1km', 'distance_to_supermarket',
    'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
    'park_area_m2', 'waterbody_area_m2',
]
X_1 = X[_core_predictors]
X_1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4665 entries, 0 to 4664 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 log_area 4665 non-null float64 1 Floor_clean 4665 non-null int64 2 SaleType_encoded 4665 non-null int64 3 bus_stop_count_1km 4665 non-null int64 4 financial_facilities_1km 4665 non-null int64 5 medical_facilities_1km 4665 non-null int64 6 distance_to_metro 4665 non-null float64 7 distance_to_railway 4665 non-null float64 8 school_count_1km 4665 non-null int64 9 kindergarten_count_1km 4665 non-null int64 10 distance_to_supermarket 4665 non-null float64 11 distance_to_techhub 4665 non-null float64 12 distance_to_park 4665 non-null float64 13 distance_to_waterbody 4665 non-null float64 14 park_area_m2 4665 non-null float64 15 waterbody_area_m2 4665 non-null float64 dtypes: float64(9), int64(7) memory usage: 583.3 KB
OLS model¶
from pysal.lib import weights
from pysal.explore import esda
import numpy
import pandas
import geopandas
import matplotlib.pyplot as plt
import seaborn
import contextily
C:\Users\Sayan\anaconda3\Lib\site-packages\spaghetti\network.py:41: FutureWarning: The next major release of pysal/spaghetti (2.0.0) will drop support for all ``libpysal.cg`` geometries. This change is a first step in refactoring ``spaghetti`` that is expected to result in dramatically reduced runtimes for network instantiation and operations. Users currently requiring network and point pattern input as ``libpysal.cg`` geometries should prepare for this simply by converting to ``shapely`` geometries. warnings.warn(dep_msg, FutureWarning, stacklevel=1)
import geopandas as gpd
from libpysal.weights import KNN
from shapely.geometry import Point
# Convert to GeoDataFrame
# Points are built as (lon, lat) -- the argument order points_from_xy
# expects -- in WGS84 (EPSG:4326).
gdf = gpd.GeoDataFrame(
df,
geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
crs="EPSG:4326"
)
# Example: k-nearest neighbors (k=8 is common)
# NOTE(review): KNN here operates on raw lat/lon degrees, not metres --
# acceptable for local nearest-neighbour ranking, but confirm a projected
# CRS is not needed.  The run also warned of 35 disconnected components,
# which likely stems from duplicate coordinates -- verify.
w = KNN.from_dataframe(gdf, k=8)
w.transform = "r" # row-standardized
C:\Users\Sayan\anaconda3\Lib\site-packages\libpysal\weights\distance.py:153: UserWarning: The weights matrix is not fully connected: There are 35 disconnected components. W.__init__(self, neighbors, id_order=ids, **kwargs)
from spreg import OLS, diagnostics_sp
from esda import Moran
import matplotlib.pyplot as plt
X_1
| log_area | Floor_clean | SaleType_encoded | bus_stop_count_1km | financial_facilities_1km | medical_facilities_1km | distance_to_metro | distance_to_railway | school_count_1km | kindergarten_count_1km | distance_to_supermarket | distance_to_techhub | distance_to_park | distance_to_waterbody | park_area_m2 | waterbody_area_m2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.117206 | 3 | 0 | 3 | 0 | 1 | 1021.855251 | 1031.314063 | 0 | 0 | 1071.728427 | 5012.910521 | 205.823897 | 963.347727 | 11609.389521 | 15481.502122 |
| 1 | 7.632401 | 4 | 0 | 7 | 7 | 8 | 1624.793018 | 1640.439676 | 2 | 1 | 324.707763 | 1371.896658 | 215.581180 | 816.380253 | 1001.151224 | 403075.658943 |
| 2 | 7.901007 | 4 | 1 | 7 | 7 | 5 | 2323.930856 | 2343.893908 | 7 | 0 | 621.209341 | 2633.436246 | 560.260318 | 1138.757111 | 518.173805 | 12577.559117 |
| 3 | 7.306531 | 4 | 0 | 8 | 12 | 6 | 71.024054 | 87.654521 | 6 | 1 | 144.297691 | 562.039276 | 128.222356 | 671.607186 | 147.420732 | 25026.590223 |
| 4 | 7.841886 | 20 | 0 | 2 | 0 | 0 | 1789.025763 | 1805.479717 | 4 | 0 | 1653.838238 | 3822.477586 | 70.367519 | 1359.177291 | 640.230205 | 163020.728866 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4660 | 7.801391 | 16 | 0 | 12 | 6 | 1 | 7410.102912 | 2553.008614 | 4 | 0 | 1256.031510 | 879.973099 | 238.373796 | 508.605424 | 859.862494 | 383.302762 |
| 4661 | 7.829233 | 26 | 0 | 12 | 6 | 1 | 7410.102912 | 2553.008614 | 4 | 0 | 1256.031510 | 879.973099 | 238.373796 | 508.605424 | 859.862494 | 383.302762 |
| 4662 | 6.806829 | 9 | 0 | 9 | 11 | 6 | 832.198921 | 853.351181 | 3 | 0 | 386.751500 | 768.605871 | 157.895705 | 1122.355482 | 503.926172 | 28561.029058 |
| 4663 | 7.254178 | 0 | 0 | 9 | 11 | 6 | 832.198921 | 853.351181 | 3 | 0 | 386.751500 | 768.605871 | 157.895705 | 1122.355482 | 503.926172 | 28561.029058 |
| 4664 | 7.783224 | 2 | 0 | 9 | 11 | 6 | 832.198921 | 853.351181 | 3 | 0 | 386.751500 | 768.605871 | 157.895705 | 1122.355482 | 503.926172 | 28561.029058 |
4665 rows × 16 columns
# Dependent variable: spreg expects y as an (n, 1) column vector
y = df['log_amount'].values.reshape(-1, 1)

# Baseline OLS on the full predictor set X with KNN weights w.
# name_x/name_ds/name_w make the printed summary self-describing --
# the original output reported "Data set: unknown / Weights matrix: unknown".
ols_model = OLS(
    y, X, w=w,
    name_y='log_amount',
    name_x=list(X.columns),
    name_ds='clean_variable_all.csv',
    name_w='KNN k=8 (row-standardized)',
)

# Print full summary
print(ols_model.summary)
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 26
S.D. dependent var : 0.5348 Degrees of Freedom : 4639
R-squared : 0.2534
Adjusted R-squared : 0.2494
Sum squared residual: 995.964 F-statistic : 62.9959
Sigma-square : 0.215 Prob(F-statistic) : 1.966e-271
S.E. of regression : 0.463 Log likelihood : -3017.661
Sigma-square ML : 0.213 Akaike info criterion : 6087.321
S.E of regression ML: 0.4621 Schwarz criterion : 6254.965
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 12.87346 0.16787 76.68839 0.00000
log_area 0.49824 0.01743 28.58996 0.00000
Floor_clean 0.00001 0.00000 2.08643 0.03699
SaleType_encoded -0.08521 0.01555 -5.47991 0.00000
bus_stop_count_1km 0.00050 0.00133 0.37891 0.70477
financial_facilities_1km 0.00974 0.00124 7.83535 0.00000
medical_facilities_1km -0.00529 0.00113 -4.67024 0.00000
distance_to_metro -0.00000 0.00000 -1.18401 0.23647
distance_to_railway -0.00003 0.00001 -4.74976 0.00000
school_count_1km -0.01003 0.00203 -4.94092 0.00000
kindergarten_count_1km 0.01079 0.00327 3.30409 0.00096
distance_to_supermarket -0.00000 0.00002 -0.26503 0.79100
distance_to_techhub -0.00002 0.00001 -3.88549 0.00010
distance_to_park -0.00038 0.00004 -10.41730 0.00000
distance_to_waterbody -0.00005 0.00002 -2.94350 0.00326
park_area_m2 0.00000 0.00000 2.27834 0.02275
waterbody_area_m2 0.00000 0.00000 1.69098 0.09091
park_leisure_garden 0.08824 0.03424 2.57708 0.00999
park_leisure_park -0.04085 0.02277 -1.79403 0.07287
park_leisure_pitch 0.01488 0.02293 0.64887 0.51646
park_leisure_playground -0.06117 0.03054 -2.00312 0.04522
park_leisure_recreation_common 0.12988 0.07940 1.63587 0.10193
waterbody_type_lake -0.38751 0.11103 -3.49002 0.00049
waterbody_type_pond -0.30946 0.11157 -2.77361 0.00557
waterbody_type_reservoir -0.32965 0.11306 -2.91580 0.00356
waterbody_type_river -0.37704 0.11722 -3.21660 0.00131
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 99.397
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 516.320 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 25 985.222 0.0000
Koenker-Bassett test 25 558.714 0.0000
================================ END OF REPORT =====================================
# OLS on the reduced predictor set X_1 (dummies dropped), same y and w.
# Named dataset/weights so the summary header is self-describing instead
# of printing "unknown".
ols_model_1 = OLS(
    y, X_1, w=w,
    name_y='log_amount',
    name_x=list(X_1.columns),
    name_ds='clean_variable_all.csv',
    name_w='KNN k=8 (row-standardized)',
)

# Print full summary
print(ols_model_1.summary)
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 17
S.D. dependent var : 0.5348 Degrees of Freedom : 4648
R-squared : 0.2428
Adjusted R-squared : 0.2402
Sum squared residual: 1010.13 F-statistic : 93.1645
Sigma-square : 0.217 Prob(F-statistic) : 6.541e-266
S.E. of regression : 0.466 Log likelihood : -3050.606
Sigma-square ML : 0.217 Akaike info criterion : 6135.212
S.E of regression ML: 0.4653 Schwarz criterion : 6244.825
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 12.50966 0.13012 96.13973 0.00000
log_area 0.49580 0.01752 28.30667 0.00000
Floor_clean 0.00001 0.00000 2.18008 0.02930
SaleType_encoded -0.08561 0.01562 -5.48158 0.00000
bus_stop_count_1km -0.00038 0.00131 -0.28758 0.77368
financial_facilities_1km 0.01033 0.00123 8.36376 0.00000
medical_facilities_1km -0.00479 0.00113 -4.23461 0.00002
distance_to_metro 0.00000 0.00000 0.11785 0.90619
distance_to_railway -0.00003 0.00001 -4.81390 0.00000
school_count_1km -0.01042 0.00202 -5.15633 0.00000
kindergarten_count_1km 0.01087 0.00326 3.33820 0.00085
distance_to_supermarket 0.00000 0.00002 0.27484 0.78345
distance_to_techhub -0.00002 0.00001 -4.42819 0.00001
distance_to_park -0.00038 0.00004 -10.38167 0.00000
distance_to_waterbody -0.00005 0.00002 -2.86311 0.00421
park_area_m2 0.00000 0.00000 2.22295 0.02627
waterbody_area_m2 0.00000 0.00000 0.42946 0.66761
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 79.193
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 578.973 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 16 997.241 0.0000
Koenker-Bassett test 16 552.442 0.0000
================================ END OF REPORT =====================================
'''
X_2 = X_1[['log_area', 'Floor_clean', 'SaleType_encoded',
'bus_stop_count_1km', 'financial_facilities_1km',
'medical_facilities_1km',
'distance_to_metro', 'distance_to_railway', 'school_count_1km',
'kindergarten_count_1km', 'distance_to_supermarket',
'distance_to_techhub']]
'''
"\nX_2 = X_1[['log_area', 'Floor_clean', 'SaleType_encoded',\n 'bus_stop_count_1km', 'financial_facilities_1km',\n 'medical_facilities_1km',\n 'distance_to_metro', 'distance_to_railway', 'school_count_1km',\n 'kindergarten_count_1km', 'distance_to_supermarket',\n 'distance_to_techhub']]\n"
'''
# Run OLS (you've already defined y, X, and w)
ols_model_2 = OLS(y, X_2, w=w, name_y='log_amount')
# Print full summary
print(ols_model_2.summary)
'''
"\n# Run OLS (you've already defined y, X, and w)\nols_model_2 = OLS(y, X_2, w=w, name_y='log_amount')\n\n# Print full summary\nprint(ols_model_2.summary)\n"
from esda import Moran
import matplotlib.pyplot as plt

# Global Moran's I on the OLS residuals: tests whether the model errors are
# spatially autocorrelated under the KNN weights w.
resid = ols_model.u.flatten()
mi = Moran(resid, w)
for label, value in (("Moran's I:", mi.I), ("p-value:", mi.p_sim)):
    print(label, value)
Moran's I: 0.2751378925229339 p-value: 0.001
from spreg import LMtests

# Lagrange-multiplier diagnostics on the fitted OLS model: the (robust)
# LM Error vs LM Lag comparison points to the appropriate spatial model.
lm_tests = LMtests(ols_model, w)

_diagnostics = (
    ("LM Error", lm_tests.lme),
    ("LM Lag", lm_tests.lml),
    ("Robust LM Error", lm_tests.rlme),
    ("Robust LM Lag", lm_tests.rlml),
    ("LM SARMA", lm_tests.sarma),
)
for test_name, (statistic, p_value) in _diagnostics:
    print(f"{test_name}: {statistic:.3f}, p-value: {p_value:.3f}")
LM Error: 1624.173, p-value: 0.000 LM Lag: 1394.095, p-value: 0.000 Robust LM Error: 241.437, p-value: 0.000 Robust LM Lag: 11.359, p-value: 0.001 LM SARMA: 1635.532, p-value: 0.000
''' Step 1: Check significance
All tests have p < 0.05, so spatial dependence exists.
This means your OLS residuals are not independent in space, so a standard OLS is inadequate.
Step 2: Compare robust LM tests
Robust LM Error = 241.437, p = 0.000
Robust LM Lag = 11.359, p = 0.001
Interpretation:
Both robust tests are significant, but Robust LM Error >> Robust LM Lag.
This indicates that the spatial autocorrelation is stronger in the error term than in the dependent variable.
Step 3: Decide next model
Significant Robust LM Error → suggests Spatial Error Model (SEM) is more appropriate.
Significant Robust LM Lag → would suggest Spatial Lag Model (SAR).
Since error is dominant here, start with SEM.
SARMA (joint test) confirms that a spatial model is definitely needed.
✅ Summary / Actionable Recommendation
Your OLS residuals are strongly spatially autocorrelated.
Next step: Fit a Spatial Error Model (SEM) using spreg.GM_Error or spreg.ML_Error.
Optionally, after SEM, you can compare AIC / BIC with SAR to see which fits better. '''
import matplotlib.pyplot as plt
from splot.esda import moran_scatterplot
# Visualise global spatial autocorrelation: each point plots an
# observation's value against the spatial lag of its neighbours; the
# fitted slope equals Moran's I.
fig, ax = moran_scatterplot(mi, aspect_equal=True)
ax.set_title("Moran's I Scatterplot (OLS residuals)")
plt.show()
'''
import matplotlib.pyplot as plt
from splot.esda import moran_scatterplot
import os
# Create folder if it doesn't exist
os.makedirs("outputs", exist_ok=True)
# Plot Moran's I
fig, ax = moran_scatterplot(mi, aspect_equal=True)
ax.set_title("Moran's I Scatterplot (OLS residuals)")
# Save figure to 'outputs' folder
fig.savefig("outputs/moran_scatterplot.png", dpi=300, bbox_inches="tight")
plt.show()
'''
'\nimport matplotlib.pyplot as plt\nfrom splot.esda import moran_scatterplot\nimport os\n\n# Create folder if it doesn\'t exist\nos.makedirs("outputs", exist_ok=True)\n\n# Plot Moran\'s I\nfig, ax = moran_scatterplot(mi, aspect_equal=True)\nax.set_title("Moran\'s I Scatterplot (OLS residuals)")\n\n# Save figure to \'outputs\' folder\nfig.savefig("outputs/moran_scatterplot.png", dpi=300, bbox_inches="tight")\n\nplt.show()\n'
LISA MAP¶
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import numpy as np
from esda import Moran_Local

# ---- LISA (Local Moran's I) cluster map ---------------------------------
# y = variable analysed (here: the log_amount column vector)
# w = spatial weights matrix (same KNN weights used for global Moran's I)
# Moran_Local expects a 1-D array, so flatten the (n, 1) column vector.
lisa = Moran_Local(y.flatten(), w)

gdf['local_I'] = lisa.Is            # local Moran statistic per observation
gdf['p_value'] = lisa.p_sim         # pseudo p-value from permutations
gdf['quadrant'] = lisa.q            # 1=HH, 2=LH, 3=LL, 4=HL
gdf['significant'] = lisa.p_sim < 0.05  # 5% significance

import matplotlib.colors as colors

# Map quadrant codes to readable cluster labels
cluster_labels = {
    1: "High-High",
    2: "Low-High",
    3: "Low-Low",
    4: "High-Low"
}

# Intended colour per cluster type
cluster_colors = {
    "High-High": "red",
    "Low-Low": "blue",
    "High-Low": "lightpink",
    "Low-High": "lightblue",
    "Not Significant": "lightgrey",
}

gdf['cluster_type'] = gdf.apply(
    lambda row: cluster_labels[row['quadrant']] if row['significant'] else "Not Significant",
    axis=1
)

# geopandas assigns ListedColormap entries to categories in SORTED category
# order, so the colour list must follow sorted(categories).  The original
# hard-coded list silently mis-coloured the clusters (e.g. High-Low drew
# blue instead of lightpink).
_categories = sorted(gdf['cluster_type'].unique())
_cmap = colors.ListedColormap([cluster_colors[c] for c in _categories])

# Plot
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(column='cluster_type',
         cmap=_cmap,
         legend=True,
         ax=ax)
ax.set_title("LISA Cluster Map (Local Moran’s I)", fontsize=16)
ax.axis("off")
plt.show()
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# ----------------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------------
# Housing GeoDataFrame (gdf) already carries the LISA results
# ('quadrant', 'significant') computed above.
# Load BBMP boundary shapefile
bbmp = gpd.read_file("BBMP_shape.shp")

# ----------------------------------------------------------
# 2. PROJECT BOTH LAYERS (recommended: Web Mercator 3857)
# ----------------------------------------------------------
gdf_web = gdf.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)

# ----------------------------------------------------------
# 3. MAP LISA CLUSTERS (colour-blind friendly palette)
# ----------------------------------------------------------
cluster_labels = {
    1: "High-High",
    2: "Low-High",
    3: "Low-Low",
    4: "High-Low"
}

# Add cluster type with significance filter
gdf_web["cluster_type"] = gdf_web.apply(
    lambda row: cluster_labels[row["quadrant"]] if row["significant"] else "Not Significant",
    axis=1
)

# Colour-blind friendly palette ("Tol palette")
cluster_palette = {
    "High-High": "#D81B60",        # reddish-pink
    "Low-Low": "#1E88E5",          # strong blue
    "High-Low": "#FFC107",         # yellow
    "Low-High": "#004D40",         # dark teal
    "Not Significant": "#BDBDBD",  # grey
}

# geopandas assigns ListedColormap entries to categories in SORTED category
# order; build the colour list that way.  (The original iterated the palette
# dict in insertion order, silently mismatching colours to clusters, and
# contained a dead `norm = colors.ListedColormap` assignment.)
_categories = sorted(gdf_web["cluster_type"].unique())
cmap = colors.ListedColormap([cluster_palette[c] for c in _categories])

# ----------------------------------------------------------
# 4. CREATE FIGURE
# ----------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 12))

# Plot BBMP boundary first so points draw on top
bbmp_web.boundary.plot(ax=ax, color="black", linewidth=0.5)

# Plot LISA clusters
gdf_web.plot(
    column="cluster_type",
    ax=ax,
    categorical=True,
    legend=True,
    linewidth=0.2,
    edgecolor="white",
    cmap=cmap,
)

# ----------------------------------------------------------
# 5. STYLING
# ----------------------------------------------------------
ax.set_title("LISA Cluster Map – BBMP, Bengaluru (Local Moran’s I)", fontsize=18)
ax.axis("off")
plt.show()
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import contextily as ctx
import os

# ----------------------------------------------------------
# 1. LOAD DATA
# ----------------------------------------------------------
# gdf (with LISA results) is defined above.
# BBMP boundary
bbmp = gpd.read_file("BBMP_shape.shp")

# ----------------------------------------------------------
# 2. PROJECT EVERYTHING TO WEB MERCATOR 3857 (required by web basemaps)
# ----------------------------------------------------------
gdf_web = gdf.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)

# ----------------------------------------------------------
# 3. CLASSIFY CLUSTERS
# ----------------------------------------------------------
cluster_labels = {
    1: "High-High",
    2: "Low-High",
    3: "Low-Low",
    4: "High-Low"
}

# Assign cluster types only for significant observations
gdf_web["cluster_type"] = gdf_web.apply(
    lambda row: cluster_labels[row["quadrant"]] if row["significant"] else "Not Significant",
    axis=1
)

# Colour-blind friendly palette
cluster_palette = {
    "High-High": "#D81B60",        # reddish-pink
    "Low-Low": "#1E88E5",          # blue
    "High-Low": "#FFC107",         # yellow
    "Low-High": "#004D40",         # dark teal
    "Not Significant": "#BDBDBD",  # neutral grey
}

# geopandas maps ListedColormap entries to categories in SORTED category
# order -- the original's dict-insertion order mis-coloured the clusters.
_categories = sorted(gdf_web["cluster_type"].unique())
cmap = colors.ListedColormap([cluster_palette[c] for c in _categories])

# ----------------------------------------------------------
# 4. PLOT FIGURE
# ----------------------------------------------------------
fig, ax = plt.subplots(figsize=(13, 13))

# 4A. Invisible boundary plot first: sets the axes extent so the basemap
# is fetched for the right area (bbmp_web is already in EPSG:3857).
bbmp_web.plot(ax=ax, alpha=0)
ctx.add_basemap(
    ax,
    source=ctx.providers.CartoDB.Positron,
    zoom=12
)

# 4B. Plot BBMP boundary
bbmp_web.boundary.plot(ax=ax, color="black", linewidth=0.7)

# 4C. Plot LISA clusters
gdf_web.plot(
    column="cluster_type",
    ax=ax,
    legend=True,
    categorical=True,
    linewidth=0.3,
    edgecolor="white",
    cmap=cmap,
)

# ----------------------------------------------------------
# 5. STYLING
# ----------------------------------------------------------
ax.set_title("LISA Cluster Map – BBMP, Bengaluru (Local Moran’s I)", fontsize=18)
ax.axis("off")

# ----------------------------------------------------------
# 6. SAVE HIGH-RES OUTPUT
# ----------------------------------------------------------
# Ensure the output directory exists -- savefig raises FileNotFoundError
# for a missing folder.
os.makedirs("outputs", exist_ok=True)
output_path = "outputs/lisa_cluster_bbmp_osm.png"
plt.savefig(output_path, dpi=600, bbox_inches="tight")
plt.close()
print(f"✅ LISA map with OSM basemap saved → {output_path}")
✅ LISA map with OSM basemap saved → outputs/lisa_cluster_bbmp_osm.png
SEM model results¶
from spreg import ML_Error

# --- Spatial Error Model (ML), full predictor set X ---
# name_x/name_ds/name_w make the summary header self-describing -- the
# original output reported "Data set: unknown / Weights matrix: unknown".
sem_model = ML_Error(
    y, X, w=w,
    name_y='log_amount',
    name_x=list(X.columns),
    name_ds='clean_variable_all.csv',
    name_w='KNN k=8 (row-standardized)',
)

# --- Summary ---
print(sem_model.summary)
C:\Users\Sayan\anaconda3\Lib\site-packages\spreg\ml_error.py:184: RuntimeWarning: Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance. res = minimize_scalar(
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: ML SPATIAL ERROR (METHOD = full)
---------------------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 26
S.D. dependent var : 0.5348 Degrees of Freedom : 4639
Pseudo R-squared : 0.2463
Log likelihood : -2586.5186
Sigma-square ML : 0.1695 Akaike info criterion : 5225.037
S.E of regression : 0.4117 Schwarz criterion : 5392.681
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 13.06960 0.22321 58.55260 0.00000
log_area 0.48499 0.01651 29.37162 0.00000
Floor_clean 0.00001 0.00000 1.76428 0.07768
SaleType_encoded -0.08610 0.01529 -5.63025 0.00000
bus_stop_count_1km 0.00129 0.00226 0.57039 0.56841
financial_facilities_1km 0.00596 0.00199 2.99286 0.00276
medical_facilities_1km -0.00386 0.00198 -1.94539 0.05173
distance_to_metro -0.00000 0.00001 -0.23950 0.81072
distance_to_railway -0.00002 0.00001 -1.56572 0.11741
school_count_1km -0.00321 0.00340 -0.94557 0.34437
kindergarten_count_1km 0.00702 0.00614 1.14291 0.25307
distance_to_supermarket -0.00004 0.00003 -1.21798 0.22323
distance_to_techhub -0.00002 0.00001 -1.87366 0.06098
distance_to_park -0.00036 0.00005 -7.00329 0.00000
distance_to_waterbody -0.00003 0.00003 -1.08235 0.27910
park_area_m2 0.00000 0.00000 1.14979 0.25023
waterbody_area_m2 0.00000 0.00000 1.51846 0.12890
park_leisure_garden 0.05015 0.04084 1.22803 0.21944
park_leisure_park -0.02732 0.02790 -0.97905 0.32756
park_leisure_pitch 0.01997 0.02781 0.71816 0.47266
park_leisure_playground -0.03781 0.03727 -1.01437 0.31041
park_leisure_recreation_common 0.03083 0.09291 0.33185 0.74000
waterbody_type_lake -0.55142 0.18569 -2.96958 0.00298
waterbody_type_pond -0.49897 0.18555 -2.68921 0.00716
waterbody_type_reservoir -0.50966 0.18848 -2.70410 0.00685
waterbody_type_river -0.52593 0.19341 -2.71931 0.00654
lambda 0.58059 0.01605 36.17973 0.00000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
from spreg import ML_Error

# --- Spatial Error Model (ML), reduced predictor set X_1 ---
# Named dataset/weights so the summary header is self-describing instead
# of printing "unknown".
sem_model_1 = ML_Error(
    y, X_1, w=w,
    name_y='log_amount',
    name_x=list(X_1.columns),
    name_ds='clean_variable_all.csv',
    name_w='KNN k=8 (row-standardized)',
)

# --- Summary ---
print(sem_model_1.summary)
C:\Users\Sayan\anaconda3\Lib\site-packages\spreg\ml_error.py:184: RuntimeWarning: Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance. res = minimize_scalar(
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: ML SPATIAL ERROR (METHOD = full)
---------------------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 17
S.D. dependent var : 0.5348 Degrees of Freedom : 4648
Pseudo R-squared : 0.2367
Log likelihood : -2597.6105
Sigma-square ML : 0.1700 Akaike info criterion : 5229.221
S.E of regression : 0.4124 Schwarz criterion : 5338.834
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 12.54911 0.13235 94.81613 0.00000
log_area 0.48256 0.01652 29.21551 0.00000
Floor_clean 0.00001 0.00000 1.80679 0.07080
SaleType_encoded -0.08790 0.01530 -5.74553 0.00000
bus_stop_count_1km 0.00094 0.00227 0.41497 0.67816
financial_facilities_1km 0.00620 0.00200 3.10435 0.00191
medical_facilities_1km -0.00390 0.00199 -1.95426 0.05067
distance_to_metro 0.00000 0.00001 0.11247 0.91045
distance_to_railway -0.00002 0.00001 -1.37226 0.16998
school_count_1km -0.00323 0.00343 -0.94216 0.34611
kindergarten_count_1km 0.00700 0.00620 1.12899 0.25890
distance_to_supermarket -0.00003 0.00003 -1.14969 0.25027
distance_to_techhub -0.00002 0.00001 -2.16373 0.03049
distance_to_park -0.00036 0.00005 -7.00452 0.00000
distance_to_waterbody -0.00003 0.00003 -1.05035 0.29356
park_area_m2 0.00000 0.00000 1.20651 0.22762
waterbody_area_m2 0.00000 0.00000 1.02931 0.30333
lambda 0.58748 0.01583 37.12110 0.00000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
SLM model¶
from spreg import ML_Lag

# --- Spatial Lag Model (SLM) ---
# Maximum-likelihood fit of log_amount on the full design matrix X plus the
# spatially lagged dependent variable (W * log_amount), using weights w.
slm_model = ML_Lag(y, X, w=w, name_y='log_amount')

# --- Summary: full spreg regression report, including impact estimates ---
print(slm_model.summary)
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG (METHOD = FULL)
-----------------------------------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 27
S.D. dependent var : 0.5348 Degrees of Freedom : 4638
Pseudo R-squared : 0.3947
Spatial Pseudo R-squared: 0.2422
Log likelihood : -2619.6914
Sigma-square ML : 0.1737 Akaike info criterion : 5293.383
S.E of regression : 0.4168 Schwarz criterion : 5467.475
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 4.73382 0.28530 16.59252 0.00000
log_area 0.44044 0.01592 27.65873 0.00000
Floor_clean 0.00001 0.00000 1.78350 0.07451
SaleType_encoded -0.06951 0.01401 -4.96235 0.00000
bus_stop_count_1km -0.00082 0.00120 -0.68796 0.49148
financial_facilities_1km 0.00449 0.00113 3.98336 0.00007
medical_facilities_1km -0.00168 0.00102 -1.64525 0.09992
distance_to_metro -0.00000 0.00000 -1.25982 0.20773
distance_to_railway -0.00001 0.00001 -2.36374 0.01809
school_count_1km -0.00459 0.00183 -2.50621 0.01220
kindergarten_count_1km 0.00292 0.00294 0.99192 0.32124
distance_to_supermarket 0.00001 0.00002 0.60447 0.54553
distance_to_techhub -0.00001 0.00000 -1.41252 0.15780
distance_to_park -0.00022 0.00003 -6.54467 0.00000
distance_to_waterbody -0.00003 0.00002 -1.65230 0.09847
park_area_m2 0.00000 0.00000 0.38296 0.70175
waterbody_area_m2 0.00000 0.00000 1.43595 0.15102
park_leisure_garden 0.05183 0.03080 1.68282 0.09241
park_leisure_park -0.02993 0.02049 -1.46074 0.14409
park_leisure_pitch 0.00732 0.02063 0.35471 0.72281
park_leisure_playground -0.04145 0.02748 -1.50819 0.13151
park_leisure_recreation_common 0.03288 0.07143 0.46029 0.64531
waterbody_type_lake -0.23071 0.10011 -2.30458 0.02119
waterbody_type_pond -0.19092 0.10054 -1.89887 0.05758
waterbody_type_reservoir -0.18988 0.10186 -1.86406 0.06231
waterbody_type_river -0.21463 0.10561 -2.03227 0.04213
W_log_amount 0.52432 0.01594 32.90025 0.00000
------------------------------------------------------------------------------------
SPATIAL LAG MODEL IMPACTS
Impacts computed using the 'simple' method.
Variable Direct Indirect Total
log_area 0.4404 0.4855 0.9259
Floor_clean 0.0000 0.0000 0.0000
SaleType_encoded -0.0695 -0.0766 -0.1461
bus_stop_count_1km -0.0008 -0.0009 -0.0017
financial_facilities_1km 0.0045 0.0049 0.0094
medical_facilities_1km -0.0017 -0.0019 -0.0035
distance_to_metro -0.0000 -0.0000 -0.0000
distance_to_railway -0.0000 -0.0000 -0.0000
school_count_1km -0.0046 -0.0051 -0.0096
kindergarten_count_1km 0.0029 0.0032 0.0061
distance_to_supermarket 0.0000 0.0000 0.0000
distance_to_techhub -0.0000 -0.0000 -0.0000
distance_to_park -0.0002 -0.0002 -0.0005
distance_to_waterbody -0.0000 -0.0000 -0.0001
park_area_m2 0.0000 0.0000 0.0000
waterbody_area_m2 0.0000 0.0000 0.0000
park_leisure_garden 0.0518 0.0571 0.1090
park_leisure_park -0.0299 -0.0330 -0.0629
park_leisure_pitch 0.0073 0.0081 0.0154
park_leisure_playground -0.0415 -0.0457 -0.0871
park_leisure_recreation_common 0.0329 0.0362 0.0691
waterbody_type_lake -0.2307 -0.2543 -0.4850
waterbody_type_pond -0.1909 -0.2104 -0.4014
waterbody_type_reservoir -0.1899 -0.2093 -0.3992
waterbody_type_river -0.2146 -0.2366 -0.4512
================================ END OF REPORT =====================================
from spreg import ML_Lag

# --- Spatial Lag Model on the reduced design matrix X_1 ---
# NOTE(review): X_1 appears to drop the park-leisure / waterbody-type dummies
# (17 vs 26 regressors in the reports) — confirm against where X_1 is built.
slm_model_1 = ML_Lag(y, X_1, w=w, name_y='log_amount')

# --- Summary ---
print(slm_model_1.summary)
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG (METHOD = FULL)
-----------------------------------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 18
S.D. dependent var : 0.5348 Degrees of Freedom : 4647
Pseudo R-squared : 0.3922
Spatial Pseudo R-squared: 0.2304
Log likelihood : -2633.8689
Sigma-square ML : 0.1745 Akaike info criterion : 5303.738
S.E of regression : 0.4177 Schwarz criterion : 5419.799
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 4.37808 0.26032 16.81826 0.00000
log_area 0.43811 0.01594 27.49324 0.00000
Floor_clean 0.00001 0.00000 1.84520 0.06501
SaleType_encoded -0.06972 0.01402 -4.97357 0.00000
bus_stop_count_1km -0.00149 0.00117 -1.27244 0.20322
financial_facilities_1km 0.00477 0.00111 4.28054 0.00002
medical_facilities_1km -0.00134 0.00102 -1.32133 0.18639
distance_to_metro -0.00000 0.00000 -0.47065 0.63789
distance_to_railway -0.00001 0.00001 -2.40006 0.01639
school_count_1km -0.00483 0.00181 -2.66346 0.00773
kindergarten_count_1km 0.00284 0.00292 0.97097 0.33156
distance_to_supermarket 0.00002 0.00002 1.01730 0.30901
distance_to_techhub -0.00001 0.00000 -1.70910 0.08743
distance_to_park -0.00021 0.00003 -6.39919 0.00000
distance_to_waterbody -0.00003 0.00002 -1.55840 0.11914
park_area_m2 0.00000 0.00000 0.32814 0.74281
waterbody_area_m2 0.00000 0.00000 0.67907 0.49710
W_log_amount 0.53324 0.01576 33.84253 0.00000
------------------------------------------------------------------------------------
SPATIAL LAG MODEL IMPACTS
Impacts computed using the 'simple' method.
Variable Direct Indirect Total
log_area 0.4381 0.5005 0.9386
Floor_clean 0.0000 0.0000 0.0000
SaleType_encoded -0.0697 -0.0796 -0.1494
bus_stop_count_1km -0.0015 -0.0017 -0.0032
financial_facilities_1km 0.0048 0.0054 0.0102
medical_facilities_1km -0.0013 -0.0015 -0.0029
distance_to_metro -0.0000 -0.0000 -0.0000
distance_to_railway -0.0000 -0.0000 -0.0000
school_count_1km -0.0048 -0.0055 -0.0104
kindergarten_count_1km 0.0028 0.0032 0.0061
distance_to_supermarket 0.0000 0.0000 0.0000
distance_to_techhub -0.0000 -0.0000 -0.0000
distance_to_park -0.0002 -0.0002 -0.0004
distance_to_waterbody -0.0000 -0.0000 -0.0001
park_area_m2 0.0000 0.0000 0.0000
waterbody_area_m2 0.0000 0.0000 0.0000
================================ END OF REPORT =====================================
from spreg import GM_Combo_Het

# --- SARAR combo model (GMM version) ---
# Combines a spatial lag of the dependent variable with a spatial error term,
# estimated by GMM with heteroskedasticity-robust steps.
sarar_gmm = GM_Combo_Het(y, X, w=w, name_y='log_amount')

print(sarar_gmm.summary)
REGRESSION RESULTS
------------------
SUMMARY OF OUTPUT: SPATIALLY WEIGHTED 2SLS- GM-COMBO MODEL (HET)
----------------------------------------------------------------
Data set : unknown
Weights matrix : unknown
Dependent Variable : log_amount Number of Observations: 4665
Mean dependent var : 15.7608 Number of Variables : 27
S.D. dependent var : 0.5348 Degrees of Freedom : 4638
Pseudo R-squared : 0.3339
Spatial Pseudo R-squared: 0.2502
N. of iterations : 1 Step1c computed : No
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 9.50360 1.41938 6.69561 0.00000
log_area 0.48378 0.03522 13.73422 0.00000
Floor_clean 0.00001 0.00000 8.98407 0.00000
SaleType_encoded -0.08417 0.01604 -5.24836 0.00000
bus_stop_count_1km 0.00022 0.00216 0.10112 0.91945
financial_facilities_1km 0.00571 0.00209 2.73861 0.00617
medical_facilities_1km -0.00310 0.00184 -1.68353 0.09227
distance_to_metro -0.00000 0.00000 -0.47519 0.63465
distance_to_railway -0.00002 0.00001 -1.61853 0.10555
school_count_1km -0.00426 0.00295 -1.44683 0.14795
kindergarten_count_1km 0.00537 0.00546 0.98251 0.32585
distance_to_supermarket -0.00002 0.00003 -0.63886 0.52292
distance_to_techhub -0.00001 0.00001 -1.40858 0.15896
distance_to_park -0.00031 0.00005 -6.12755 0.00000
distance_to_waterbody -0.00004 0.00003 -1.24871 0.21177
park_area_m2 0.00000 0.00000 1.47241 0.14091
waterbody_area_m2 0.00000 0.00000 1.30810 0.19084
park_leisure_garden 0.05579 0.03854 1.44755 0.14774
park_leisure_park -0.02822 0.02750 -1.02614 0.30482
park_leisure_pitch 0.01645 0.02625 0.62676 0.53082
park_leisure_playground -0.04163 0.03440 -1.21023 0.22619
park_leisure_recreation_common 0.03133 0.11042 0.28374 0.77661
waterbody_type_lake -0.44360 0.25206 -1.75988 0.07843
waterbody_type_pond -0.39358 0.25207 -1.56137 0.11844
waterbody_type_reservoir -0.40034 0.25376 -1.57760 0.11466
waterbody_type_river -0.41660 0.25807 -1.61427 0.10647
W_log_amount 0.21869 0.08171 2.67626 0.00744
lambda 0.48136 0.06665 7.22237 0.00000
------------------------------------------------------------------------------------
Instrumented: W_log_amount
Instruments: W_Floor_clean, W_SaleType_encoded, W_bus_stop_count_1km,
W_distance_to_metro, W_distance_to_park, W_distance_to_railway,
W_distance_to_supermarket, W_distance_to_techhub,
W_distance_to_waterbody, W_financial_facilities_1km,
W_kindergarten_count_1km, W_log_area, W_medical_facilities_1km,
W_park_area_m2, W_park_leisure_garden, W_park_leisure_park,
W_park_leisure_pitch, W_park_leisure_playground,
W_park_leisure_recreation_common, W_school_count_1km,
W_waterbody_area_m2, W_waterbody_type_lake,
W_waterbody_type_pond, W_waterbody_type_reservoir,
W_waterbody_type_river
================================ END OF REPORT =====================================
import numpy as np

# GMM estimation reports no log-likelihood, so approximate one by plugging
# the GMM residuals into the concentrated Gaussian log-likelihood and then
# form an AIC that is roughly comparable to the ML models above.
obs_count = sarar_gmm.n        # number of observations (same as len(y))
param_count = sarar_gmm.k      # parameters estimated (intercept + betas + spatial terms)
resid = sarar_gmm.u            # residual vector from the fitted model

# sum of squared residuals
ssr = np.square(resid).sum()

# pseudo log-likelihood under Gaussian errors
llf = -obs_count / 2 * (np.log(2 * np.pi) + 1 + np.log(ssr / obs_count))

# approximate AIC
AIC = -2 * llf + 2 * param_count
print(f"Approximate AIC (GMM-SARAR): {AIC:.3f}")
Approximate AIC (GMM-SARAR): 5584.546
from esda.moran import Moran
import numpy as np

# Global Moran's I on each model's residuals: values near zero indicate the
# model has absorbed the spatial autocorrelation; larger positive values
# mean spatial structure remains unexplained.
mi_ols = Moran(ols_model.u, w)
mi_slm = Moran(slm_model.u, w)
mi_sem = Moran(sem_model.u, w)
mi_sarar = Moran(sarar_gmm.u, w)

print("Moran's I of residuals:")
for label, mi in (("OLS", mi_ols), ("SLM", mi_slm),
                  ("SEM", mi_sem), ("SARAR", mi_sarar)):
    print(f"{label}: {mi.I:.3f}")
Moran's I of residuals: OLS: 0.275 SLM: 0.037 SEM: 0.288 SARAR: 0.175
GWR¶
pip install mgwr
Requirement already satisfied: mgwr in c:\users\sayan\anaconda3\lib\site-packages (2.2.1) Requirement already satisfied: scipy>=0.11 in c:\users\sayan\anaconda3\lib\site-packages (from mgwr) (1.13.1) Requirement already satisfied: numpy>=1.3 in c:\users\sayan\anaconda3\lib\site-packages (from mgwr) (1.26.4) Requirement already satisfied: libpysal>=4.0.0 in c:\users\sayan\anaconda3\lib\site-packages (from mgwr) (4.13.0) Requirement already satisfied: spglm>=1.0.6 in c:\users\sayan\anaconda3\lib\site-packages (from mgwr) (1.1.0) Requirement already satisfied: spreg in c:\users\sayan\anaconda3\lib\site-packages (from mgwr) (1.8.3) Requirement already satisfied: beautifulsoup4>=4.10 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (4.12.3) Requirement already satisfied: geopandas>=0.10.0 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (1.0.1) Requirement already satisfied: packaging>=22 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (24.1) Requirement already satisfied: pandas>=1.4 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (2.2.2) Requirement already satisfied: platformdirs>=2.0.2 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (3.10.0) Requirement already satisfied: requests>=2.27 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (2.32.3) Requirement already satisfied: shapely>=2.0.1 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (2.0.7) Requirement already satisfied: scikit-learn>=1.1 in c:\users\sayan\anaconda3\lib\site-packages (from libpysal>=4.0.0->mgwr) (1.5.1) Requirement already satisfied: soupsieve>1.2 in c:\users\sayan\anaconda3\lib\site-packages (from beautifulsoup4>=4.10->libpysal>=4.0.0->mgwr) (2.5) Requirement already satisfied: pyogrio>=0.7.2 in c:\users\sayan\anaconda3\lib\site-packages (from geopandas>=0.10.0->libpysal>=4.0.0->mgwr) (0.10.0) Requirement already 
satisfied: pyproj>=3.3.0 in c:\users\sayan\anaconda3\lib\site-packages (from geopandas>=0.10.0->libpysal>=4.0.0->mgwr) (3.7.1) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\sayan\anaconda3\lib\site-packages (from pandas>=1.4->libpysal>=4.0.0->mgwr) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\sayan\anaconda3\lib\site-packages (from pandas>=1.4->libpysal>=4.0.0->mgwr) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\sayan\anaconda3\lib\site-packages (from pandas>=1.4->libpysal>=4.0.0->mgwr) (2023.3) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\sayan\anaconda3\lib\site-packages (from requests>=2.27->libpysal>=4.0.0->mgwr) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\sayan\anaconda3\lib\site-packages (from requests>=2.27->libpysal>=4.0.0->mgwr) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\sayan\anaconda3\lib\site-packages (from requests>=2.27->libpysal>=4.0.0->mgwr) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\sayan\anaconda3\lib\site-packages (from requests>=2.27->libpysal>=4.0.0->mgwr) (2025.1.31) Requirement already satisfied: joblib>=1.2.0 in c:\users\sayan\anaconda3\lib\site-packages (from scikit-learn>=1.1->libpysal>=4.0.0->mgwr) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\sayan\anaconda3\lib\site-packages (from scikit-learn>=1.1->libpysal>=4.0.0->mgwr) (3.5.0) Requirement already satisfied: six>=1.5 in c:\users\sayan\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=1.4->libpysal>=4.0.0->mgwr) (1.16.0) Note: you may need to restart the kernel to use updated packages.
# Locate every boolean dummy column and cast it to 0/1 integers so the
# downstream design matrices (GWR, VIF) are purely numeric.
bool_cols = df.select_dtypes(include='bool').columns
print(bool_cols)
df[bool_cols] = df[bool_cols].astype(int)
Index(['park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
'park_leisure_playground', 'park_leisure_recreation_common',
'waterbody_type_lake', 'waterbody_type_pond',
'waterbody_type_reservoir', 'waterbody_type_river'],
dtype='object')
# Re-inspect dtypes: the nine park/waterbody dummy columns should now be int32.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4665 entries, 0 to 4664 Data columns (total 58 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Area (in Sq. ft) 4665 non-null float64 1 Amount 4665 non-null float64 2 Market Value 4665 non-null float64 3 City 4665 non-null object 4 Project Name 4665 non-null object 5 Location 4665 non-null object 6 Latitude 4665 non-null float64 7 Longitude 4665 non-null float64 8 Sale Type 4665 non-null object 9 Registration Date 4665 non-null object 10 Quaters 4665 non-null object 11 YR 4665 non-null int64 12 Amount in CR 4665 non-null float64 13 Price Range 4665 non-null object 14 Project Name main 2306 non-null object 15 Rate (Rs/sft) 4665 non-null object 16 Unit 4301 non-null object 17 Floor 4506 non-null object 18 Tower 962 non-null object 19 Wing 1438 non-null object 20 Party 1 4665 non-null object 21 Party 2 4665 non-null object 22 Count 4665 non-null object 23 bus_stop_count_1km 4665 non-null int64 24 financial_facilities_1km 4665 non-null int64 25 medical_facilities_1km 4665 non-null int64 26 hospital_facilities_1km 4665 non-null int64 27 distance_to_metro 4665 non-null float64 28 distance_to_railway 4665 non-null float64 29 school_count_1km 4665 non-null int64 30 kindergarten_count_1km 4665 non-null int64 31 distance_to_supermarket 4665 non-null float64 32 distance_to_techhub 4665 non-null float64 33 distance_to_park 4665 non-null float64 34 distance_to_waterbody 4665 non-null float64 35 park_name 750 non-null object 36 park_area_m2 4665 non-null float64 37 park_lat 4665 non-null float64 38 park_lon 4665 non-null float64 39 waterbody_name 3080 non-null object 40 waterbody_waterbody_type_clean 4665 non-null object 41 waterbody_area_m2 4665 non-null float64 42 waterbody_lat 4665 non-null float64 43 waterbody_lon 4665 non-null float64 44 Floor_clean 4665 non-null int64 45 SaleType_encoded 4665 non-null int64 46 park_leisure_garden 4665 non-null int32 47 park_leisure_park 4665 non-null int32 48 
park_leisure_pitch 4665 non-null int32 49 park_leisure_playground 4665 non-null int32 50 park_leisure_recreation_common 4665 non-null int32 51 waterbody_type_lake 4665 non-null int32 52 waterbody_type_pond 4665 non-null int32 53 waterbody_type_reservoir 4665 non-null int32 54 waterbody_type_river 4665 non-null int32 55 log_amount 4665 non-null float64 56 area_raw 4665 non-null float64 57 log_area 4665 non-null float64 dtypes: float64(21), int32(9), int64(9), object(19) memory usage: 1.9+ MB
# List all 58 columns to decide which to keep for the modelling frame below.
df.columns
Index(['Area (in Sq. ft)', 'Amount', 'Market Value', 'City', 'Project Name',
'Location', 'Latitude', 'Longitude', 'Sale Type', 'Registration Date',
'Quaters', 'YR', 'Amount in CR', 'Price Range', 'Project Name main',
'Rate (Rs/sft)', 'Unit', 'Floor', 'Tower', 'Wing', 'Party 1', 'Party 2',
'Count', 'bus_stop_count_1km', 'financial_facilities_1km',
'medical_facilities_1km', 'hospital_facilities_1km',
'distance_to_metro', 'distance_to_railway', 'school_count_1km',
'kindergarten_count_1km', 'distance_to_supermarket',
'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
'park_name', 'park_area_m2', 'park_lat', 'park_lon', 'waterbody_name',
'waterbody_waterbody_type_clean', 'waterbody_area_m2', 'waterbody_lat',
'waterbody_lon', 'Floor_clean', 'SaleType_encoded',
'park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
'park_leisure_playground', 'park_leisure_recreation_common',
'waterbody_type_lake', 'waterbody_type_pond',
'waterbody_type_reservoir', 'waterbody_type_river', 'log_amount',
'area_raw', 'log_area'],
dtype='object')
# Drop identifier, free-text, and redundant columns, keeping only the
# numeric features (plus coordinates) needed for the GWR modelling frame.
cols_to_drop = [
    'Area (in Sq. ft)', 'Amount', 'Market Value', 'City', 'Project Name',
    'Location', 'Sale Type', 'Registration Date', 'hospital_facilities_1km',
    'Quaters', 'YR', 'Amount in CR', 'Price Range', 'Project Name main',
    'Rate (Rs/sft)', 'Unit', 'Floor', 'Tower', 'Wing', 'Party 1', 'Party 2',
    'Count', 'park_lat', 'park_lon', 'waterbody_name', 'park_name',
    'waterbody_waterbody_type_clean', 'waterbody_lat', 'waterbody_lon',
]
df_dropped = df.drop(columns=cols_to_drop)
df_dropped.columns
Index(['Latitude', 'Longitude', 'bus_stop_count_1km',
'financial_facilities_1km', 'medical_facilities_1km',
'distance_to_metro', 'distance_to_railway', 'school_count_1km',
'kindergarten_count_1km', 'distance_to_supermarket',
'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
'park_area_m2', 'waterbody_area_m2', 'Floor_clean', 'SaleType_encoded',
'park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
'park_leisure_playground', 'park_leisure_recreation_common',
'waterbody_type_lake', 'waterbody_type_pond',
'waterbody_type_reservoir', 'waterbody_type_river', 'log_amount',
'area_raw', 'log_area'],
dtype='object')
# Confirm the modelling frame is fully numeric (29 columns, no missing values).
df_dropped.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4665 entries, 0 to 4664 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Latitude 4665 non-null float64 1 Longitude 4665 non-null float64 2 bus_stop_count_1km 4665 non-null int64 3 financial_facilities_1km 4665 non-null int64 4 medical_facilities_1km 4665 non-null int64 5 distance_to_metro 4665 non-null float64 6 distance_to_railway 4665 non-null float64 7 school_count_1km 4665 non-null int64 8 kindergarten_count_1km 4665 non-null int64 9 distance_to_supermarket 4665 non-null float64 10 distance_to_techhub 4665 non-null float64 11 distance_to_park 4665 non-null float64 12 distance_to_waterbody 4665 non-null float64 13 park_area_m2 4665 non-null float64 14 waterbody_area_m2 4665 non-null float64 15 Floor_clean 4665 non-null int64 16 SaleType_encoded 4665 non-null int64 17 park_leisure_garden 4665 non-null int32 18 park_leisure_park 4665 non-null int32 19 park_leisure_pitch 4665 non-null int32 20 park_leisure_playground 4665 non-null int32 21 park_leisure_recreation_common 4665 non-null int32 22 waterbody_type_lake 4665 non-null int32 23 waterbody_type_pond 4665 non-null int32 24 waterbody_type_reservoir 4665 non-null int32 25 waterbody_type_river 4665 non-null int32 26 log_amount 4665 non-null float64 27 area_raw 4665 non-null float64 28 log_area 4665 non-null float64 dtypes: float64(13), int32(9), int64(7) memory usage: 893.0 KB
import pandas as pd
import geopandas as gpd
import numpy as np
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import shift_colormap

# --- Step 1: build a point GeoDataFrame from the transaction coordinates ---
gdf = gpd.GeoDataFrame(
    df_dropped,
    geometry=gpd.points_from_xy(df_dropped['Longitude'], df_dropped['Latitude']),
    crs='EPSG:4326',
)

# --- Step 2: reproject to metric UTM coordinates (zone 43N covers Bangalore) ---
gdf_proj = gdf.to_crs(epsg=32643)

# --- Step 3: name the response and the candidate predictors ---
# NOTE: keep this order — the integer indices in the VIF table below refer to it.
y_var = 'log_amount'
X_vars = [
    'bus_stop_count_1km', 'financial_facilities_1km', 'medical_facilities_1km',
    'distance_to_metro', 'distance_to_railway', 'school_count_1km',
    'kindergarten_count_1km', 'distance_to_supermarket', 'distance_to_techhub',
    'distance_to_park', 'distance_to_waterbody', 'park_area_m2',
    'waterbody_area_m2', 'Floor_clean', 'SaleType_encoded',
    'park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
    'park_leisure_playground', 'park_leisure_recreation_common',
    'waterbody_type_lake', 'waterbody_type_pond', 'waterbody_type_reservoir',
    'waterbody_type_river', 'area_raw', 'log_area',
]
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_df = gdf_proj[X_vars].copy()

# Variance Inflation Factor per predictor; large values flag multicollinearity.
# NOTE(review): no constant column is added before computing VIF here — the
# usual convention includes an intercept; confirm this omission is intended.
vif_data = pd.DataFrame({
    "feature": X_df.columns,
    "VIF": [variance_inflation_factor(X_df.values, i)
            for i in range(X_df.shape[1])],
})
vif_data.sort_values("VIF", ascending=False)
| feature | VIF | |
|---|---|---|
| 25 | log_area | 349.771278 |
| 20 | waterbody_type_lake | 144.649901 |
| 21 | waterbody_type_pond | 51.974327 |
| 22 | waterbody_type_reservoir | 28.310408 |
| 24 | area_raw | 25.577184 |
| 23 | waterbody_type_river | 8.556582 |
| 1 | financial_facilities_1km | 7.189323 |
| 4 | distance_to_railway | 5.851528 |
| 2 | medical_facilities_1km | 5.672754 |
| 0 | bus_stop_count_1km | 5.400138 |
| 8 | distance_to_techhub | 5.212559 |
| 5 | school_count_1km | 4.881945 |
| 10 | distance_to_waterbody | 4.832278 |
| 7 | distance_to_supermarket | 4.521687 |
| 16 | park_leisure_park | 4.266484 |
| 3 | distance_to_metro | 3.805006 |
| 17 | park_leisure_pitch | 3.789340 |
| 9 | distance_to_park | 3.725549 |
| 6 | kindergarten_count_1km | 1.933640 |
| 18 | park_leisure_playground | 1.826687 |
| 15 | park_leisure_garden | 1.603634 |
| 12 | waterbody_area_m2 | 1.492221 |
| 14 | SaleType_encoded | 1.415572 |
| 11 | park_area_m2 | 1.244951 |
| 19 | park_leisure_recreation_common | 1.086052 |
| 13 | Floor_clean | 1.006822 |
# Column order here matches the integer indices shown in the VIF table above.
X_df.columns
Index(['bus_stop_count_1km', 'financial_facilities_1km',
'medical_facilities_1km', 'distance_to_metro', 'distance_to_railway',
'school_count_1km', 'kindergarten_count_1km', 'distance_to_supermarket',
'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
'park_area_m2', 'waterbody_area_m2', 'Floor_clean', 'SaleType_encoded',
'park_leisure_garden', 'park_leisure_park', 'park_leisure_pitch',
'park_leisure_playground', 'park_leisure_recreation_common',
'waterbody_type_lake', 'waterbody_type_pond',
'waterbody_type_reservoir', 'waterbody_type_river', 'area_raw',
'log_area'],
dtype='object')
# Reduced predictor set for GWR after the VIF screen: area_raw, the
# waterbody-type dummies, and several other high-VIF columns are dropped.
# NOTE(review): log_area had the highest VIF (≈350) but is retained —
# presumably as the essential dwelling-size control; confirm.
X_vars_gwr = [
    'financial_facilities_1km', 'medical_facilities_1km', 'distance_to_railway',
    'school_count_1km', 'kindergarten_count_1km',
    'distance_to_techhub', 'distance_to_park', 'distance_to_waterbody',
    'park_area_m2', 'Floor_clean', 'SaleType_encoded', 'log_area'
]
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
import numpy as np

# --- Prepare arrays for mgwr: y as (n, 1), X as (n, k), projected coords ---
y = np.array(gdf_proj[y_var]).reshape((-1, 1))
X = np.array(gdf_proj[X_vars_gwr])
coords = np.column_stack((gdf_proj.geometry.x, gdf_proj.geometry.y))
print(f"Data ready: {X.shape[0]} observations, {X.shape[1]} predictors")

# --- Step 1: select the optimal fixed Gaussian bandwidth ---
bw = Sel_BW(coords, y, X, fixed=True, kernel='gaussian').search()
print(f"\n✅ Optimal Bandwidth: {bw:.2f}")

# --- Step 2: Fit GWR (with variable names included) ---
gwr_model = GWR(
    coords,
    y,
    X,
    bw,
    fixed=True,
    kernel='gaussian',
    constant=True,
    name_x=X_vars_gwr
).fit()

# --- Step 3: Print Model Summary ---
# Fix: GWRResults.summary() prints its report itself and returns None, so the
# previous print(gwr_model.summary()) appended a spurious "None" to the output.
print("\n=========== GWR Summary with Variable Names ===========")
gwr_model.summary()
Data ready: 4665 observations, 12 predictors ✅ Optimal Bandwidth: 652.29 =========== GWR Summary with Variable Names =========== =========================================================================== Model type Gaussian Number of observations: 4665 Number of covariates: 13 Global Regression Results --------------------------------------------------------------------------- Residual sum of squares: 1010.221 Log-likelihood: -3050.813 AIC: 6127.626 AICc: 6129.716 BIC: -38289.145 R2: 0.243 Adj. R2: 0.241 Variable Est. SE t(Est/SE) p-value ------------------------------- ---------- ---------- ---------- ---------- Intercept 12.511 0.130 96.431 0.000 financial_facilities_1km 0.010 0.001 9.047 0.000 medical_facilities_1km -0.005 0.001 -4.312 0.000 distance_to_railway -0.000 0.000 -5.223 0.000 school_count_1km -0.010 0.002 -5.342 0.000 kindergarten_count_1km 0.011 0.003 3.435 0.001 distance_to_techhub -0.000 0.000 -4.760 0.000 distance_to_park -0.000 0.000 -10.815 0.000 distance_to_waterbody -0.000 0.000 -2.837 0.005 park_area_m2 0.000 0.000 2.290 0.022 Floor_clean 0.000 0.000 2.208 0.027 SaleType_encoded -0.085 0.015 -5.487 0.000 log_area 0.495 0.017 28.355 0.000 Geographically Weighted Regression (GWR) Results --------------------------------------------------------------------------- Spatial kernel: Fixed gaussian Bandwidth used: 652.290 Diagnostic information --------------------------------------------------------------------------- Residual sum of squares: 319.433 Effective number of parameters (trace(S)): 1165.218 Degree of freedom (n - trace(S)): 3499.782 Sigma estimate: 0.302 Log-likelihood: -365.223 AIC: 3062.882 AICc: 3841.221 BIC: 10582.470 R2: 0.761 Adjusted R2: 0.681 Adj. alpha (95%): 0.001 Adj. 
critical t value (95%): 3.454 Summary Statistics For GWR Parameter Estimates --------------------------------------------------------------------------- Variable Mean STD Min Median Max -------------------- ---------- ---------- ---------- ---------- ---------- Intercept 10.998 3.732 -27.470 11.120 21.980 financial_facilities 0.005 0.102 -1.104 0.002 1.673 medical_facilities_1 0.011 0.176 -1.461 -0.005 3.272 distance_to_railway -0.000 0.001 -0.003 -0.000 0.005 school_count_1km -0.004 0.113 -1.314 -0.001 0.500 kindergarten_count_1 -1304184.429 41627221.527 -1270660431.787 -0.003 78661905.723 distance_to_techhub -0.000 0.001 -0.005 -0.000 0.004 distance_to_park -0.000 0.001 -0.013 -0.000 0.007 distance_to_waterbod 0.000 0.001 -0.007 -0.000 0.004 park_area_m2 0.000 0.000 -0.000 -0.000 0.001 Floor_clean 0.013 0.023 -0.091 0.012 0.085 SaleType_encoded -0.143 0.313 -2.598 -0.112 0.808 log_area 0.704 0.466 -1.215 0.687 6.088 =========================================================================== None
import matplotlib.pyplot as plt
import geopandas as gpd

# --- Step 1: attach the local GWR coefficients to the GeoDataFrame ---
# When constant=True the params matrix has one extra leading column, which is
# the intercept, so label it manually.
if gwr_model.params.shape[1] == len(X_vars_gwr) + 1:
    var_names = ['Intercept'] + X_vars_gwr
else:
    var_names = X_vars_gwr
for idx, name in enumerate(var_names):
    gdf_proj[f'{name}_coef'] = gwr_model.params[:, idx]

# --- Step 2: choose the coefficient surface to visualize ---
var_to_plot = 'distance_to_park_coef'  # adjust if your variable name differs

# --- Step 3: quantile map of the local coefficients ---
fig, ax = plt.subplots(figsize=(10, 8))
gdf_proj.plot(column=var_to_plot, cmap='coolwarm', legend=True,
              scheme='quantiles', k=5, alpha=0.9, ax=ax)
ax.set_title("Local Effect of Distance to Green Space on Housing Prices (GWR)",
             fontsize=14, pad=12)
ax.axis('off')
plt.tight_layout()
plt.show()
# --- Step 1: attach the local GWR coefficients to the GeoDataFrame ---
# When constant=True the params matrix has one extra leading column, which is
# the intercept, so label it manually.
if gwr_model.params.shape[1] == len(X_vars_gwr) + 1:
    var_names = ['Intercept'] + X_vars_gwr
else:
    var_names = X_vars_gwr
for idx, name in enumerate(var_names):
    gdf_proj[f'{name}_coef'] = gwr_model.params[:, idx]

# --- Step 2: choose the coefficient surface to visualize ---
var_to_plot = 'distance_to_waterbody_coef'  # adjust if your variable name differs

# --- Step 3: quantile map of the local coefficients ---
fig, ax = plt.subplots(figsize=(10, 8))
gdf_proj.plot(column=var_to_plot, cmap='coolwarm', legend=True,
              scheme='quantiles', k=5, alpha=0.9, ax=ax)
ax.set_title("Local Effect of Distance to Blue Space on Housing Prices (GWR)",
             fontsize=14, pad=12)
ax.axis('off')
plt.tight_layout()
plt.show()
# Load the BBMP (Bruhat Bengaluru Mahanagara Palike) ward/zone boundaries,
# used as the city outline beneath the coefficient maps.
bbmp = gpd.read_file("BBMP_shape.shp")
# Reproject onto the transaction data's CRS (EPSG:32643, set earlier) so the
# boundary and point layers overlay correctly.
bbmp = bbmp.to_crs(gdf_proj.crs)
import contextily as ctx
import matplotlib.pyplot as plt
import geopandas as gpd

# --- Step 1: attach the local GWR coefficients to the GeoDataFrame ---
# When constant=True the params matrix has one extra leading column, which is
# the intercept, so label it manually.
if gwr_model.params.shape[1] == len(X_vars_gwr) + 1:
    var_names = ['Intercept'] + X_vars_gwr
else:
    var_names = X_vars_gwr
for idx, name in enumerate(var_names):
    gdf_proj[f'{name}_coef'] = gwr_model.params[:, idx]

var_to_plot = 'distance_to_waterbody_coef'  # adjust if your variable name differs

# --- Step 2: reproject both layers to Web Mercator for the tiled basemap ---
gdf_proj_web = gdf_proj.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)

# --- Step 3: coefficient map drawn over the BBMP boundary ---
fig, ax = plt.subplots(figsize=(12, 10))
bbmp_web.boundary.plot(ax=ax, color='black', linewidth=0.5)
gdf_proj_web.plot(column='distance_to_waterbody_coef', cmap='coolwarm',
                  legend=True, scheme='quantiles', k=5, alpha=0.8, ax=ax)

# OSM basemap tiles beneath the data layers
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

ax.set_title("Local Effect of Distance to waterbody on Housing Prices (GWR)", fontsize=14)
ax.axis('off')
plt.show()
pip install matplotlib-scalebar
Requirement already satisfied: matplotlib-scalebar in c:\users\sayan\anaconda3\lib\site-packages (0.9.0) Requirement already satisfied: matplotlib in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib-scalebar) (3.9.2) Requirement already satisfied: contourpy>=1.0.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.4.4) Requirement already satisfied: numpy>=1.23 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.26.4) Requirement already satisfied: packaging>=20.0 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (24.1) Requirement already satisfied: pillow>=8 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (10.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\sayan\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->matplotlib-scalebar) (1.16.0) Note: you may need to restart the kernel to use updated packages.
import os
import contextily as ctx
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar

# --- Ensure the output folder exists (savefig below fails otherwise) ---
# Fix: the later (stringified) version of this cell creates "outputs/",
# but this live cell did not — on a fresh environment the save crashed.
os.makedirs("outputs", exist_ok=True)

# --- Project both datasets to Web Mercator (EPSG:3857) so contextily tiles align ---
gdf_proj_web = gdf_proj.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)

# --- Plot local GWR coefficients for distance_to_waterbody ---
fig, ax = plt.subplots(figsize=(12, 10))

# BBMP administrative boundary for spatial context
bbmp_web.boundary.plot(ax=ax, color='black', linewidth=0.5)

# Local coefficients, classified into 5 quantile bins
gdf_proj_web.plot(column='distance_to_waterbody_coef',
                  cmap='coolwarm', legend=True,
                  scheme='quantiles', k=5,
                  alpha=0.8, ax=ax)

# Light CartoDB basemap underneath the data layers
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)

# --- Add map elements ---
ax.set_title("Local Effect of Distance to Waterbody on Housing Prices (GWR)", fontsize=16)
ax.axis('off')

# North arrow, positioned in axes (0-1) coordinates near the lower-right
ax.text(0.95, 0.1, 'N', transform=ax.transAxes, ha='center', va='center',
        fontsize=16, fontweight='bold')
ax.arrow(0.95, 0.12, 0, 0.05, transform=ax.transAxes, color='k',
         head_width=0.02, head_length=0.03)

# Scale bar: dx=1 because EPSG:3857 axis units are metres.
# NOTE(review): Web Mercator metres are distorted away from the equator
# (~2-3 % at Bangalore's latitude) — acceptable for a schematic scale bar.
scalebar = ScaleBar(1, units="m", location='lower left')
ax.add_artist(scalebar)

# --- Save high-resolution image ---
output_path = "outputs/gwr_distance_to_waterbody.png"
plt.savefig(output_path, dpi=600, bbox_inches='tight')
plt.close()

print(f"✅ Map saved successfully to: {output_path}")
✅ Map saved successfully to: outputs/gwr_distance_to_waterbody.png
pip install matplotlib-scalebar
Requirement already satisfied: matplotlib-scalebar in c:\users\sayan\anaconda3\lib\site-packages (0.9.0) Requirement already satisfied: matplotlib in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib-scalebar) (3.9.2) Requirement already satisfied: contourpy>=1.0.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.4.4) Requirement already satisfied: numpy>=1.23 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (1.26.4) Requirement already satisfied: packaging>=20.0 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (24.1) Requirement already satisfied: pillow>=8 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (10.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in c:\users\sayan\anaconda3\lib\site-packages (from matplotlib->matplotlib-scalebar) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\sayan\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->matplotlib-scalebar) (1.16.0) Note: you may need to restart the kernel to use updated packages.
# NOTE: dead code preserved as a bare string literal — a duplicate of the
# map-export cell above, retargeted at 'distance_to_park_coef'. It is never
# executed; the notebook only evaluates (and echoes) the string itself.
'''
import os
import contextily as ctx
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
# --- Create output folder if it doesn’t exist ---
os.makedirs("outputs", exist_ok=True)
# --- Reproject for web mapping tiles ---
gdf_proj_web = gdf_proj.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)
# --- Plot setup ---
fig, ax = plt.subplots(figsize=(12, 10))
# Plot BBMP boundary
bbmp_web.boundary.plot(ax=ax, color='black', linewidth=0.5)
# Plot GWR coefficient map (example variable)
gdf_proj_web.plot(
column='distance_to_park_coef',
cmap='coolwarm',
legend=True,
scheme='quantiles', k=5,
alpha=0.8,
ax=ax
)
# Add OSM basemap
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
# --- Add map elements ---
ax.set_title("Local Effect of Distance to Park on Housing Prices (GWR)", fontsize=16)
ax.axis('off')
# Add north arrow
ax.text(0.95, 0.1, 'N', transform=ax.transAxes, ha='center', va='center',
fontsize=16, fontweight='bold')
ax.arrow(0.95, 0.12, 0, 0.05, transform=ax.transAxes, color='k',
head_width=0.02, head_length=0.03)
# Add scale bar (using meters as CRS units)
scalebar = ScaleBar(1, units="m", location='lower left')
ax.add_artist(scalebar)
# --- Save high-resolution image ---
output_path = "outputs/gwr_distance_to_park.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"✅ Map saved successfully to: {output_path}")
'''
'\nimport os\nimport contextily as ctx\nimport matplotlib.pyplot as plt\nfrom matplotlib_scalebar.scalebar import ScaleBar\n\n# --- Create output folder if it doesn’t exist ---\nos.makedirs("outputs", exist_ok=True)\n\n# --- Reproject for web mapping tiles ---\ngdf_proj_web = gdf_proj.to_crs(epsg=3857)\nbbmp_web = bbmp.to_crs(epsg=3857)\n\n# --- Plot setup ---\nfig, ax = plt.subplots(figsize=(12, 10))\n\n# Plot BBMP boundary\nbbmp_web.boundary.plot(ax=ax, color=\'black\', linewidth=0.5)\n\n# Plot GWR coefficient map (example variable)\ngdf_proj_web.plot(\n column=\'distance_to_park_coef\',\n cmap=\'coolwarm\',\n legend=True,\n scheme=\'quantiles\', k=5,\n alpha=0.8,\n ax=ax\n)\n\n# Add OSM basemap\nctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)\n\n# --- Add map elements ---\nax.set_title("Local Effect of Distance to Park on Housing Prices (GWR)", fontsize=16)\nax.axis(\'off\')\n\n# Add north arrow\nax.text(0.95, 0.1, \'N\', transform=ax.transAxes, ha=\'center\', va=\'center\',\n fontsize=16, fontweight=\'bold\')\nax.arrow(0.95, 0.12, 0, 0.05, transform=ax.transAxes, color=\'k\',\n head_width=0.02, head_length=0.03)\n\n# Add scale bar (using meters as CRS units)\nscalebar = ScaleBar(1, units="m", location=\'lower left\')\nax.add_artist(scalebar)\n\n# --- Save high-resolution image ---\noutput_path = "outputs/gwr_distance_to_park.png"\nplt.savefig(output_path, dpi=300, bbox_inches=\'tight\')\nplt.close()\n\nprint(f"✅ Map saved successfully to: {output_path}")\n'
# Attach the per-observation local R² produced by the fitted GWR model,
# then map it over the BBMP boundary on a light basemap.
gdf_proj['local_R2'] = gwr_model.localR2

# Reproject both layers to Web Mercator so the contextily tiles line up
gdf_proj_web = gdf_proj.to_crs(epsg=3857)
bbmp_web = bbmp.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(12, 10))

# Boundary first, then the choropleth of local R² in 5 quantile classes
bbmp_web.boundary.plot(ax=ax, color='grey', linewidth=0.5)
gdf_proj_web.plot(ax=ax, column='local_R2', cmap='viridis',
                  scheme='quantiles', k=5,
                  legend=True, alpha=0.8)

# Light CartoDB tiles underneath the data layers
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)

# Title, hide the axes frame, and show the figure
ax.set_title("Local R² from GWR Model", fontsize=14, pad=12)
ax.axis('off')
plt.tight_layout()
plt.show()
# NOTE: dead code preserved as a bare string literal — a file-saving variant
# of the local-R² map cell above. It is never executed; the notebook only
# evaluates (and echoes) the string itself.
'''
import os
import contextily as ctx
import matplotlib.pyplot as plt
# --- Create output folder if it doesn’t exist ---
os.makedirs("outputs", exist_ok=True)
# --- Plot Local R² ---
fig, ax = plt.subplots(figsize=(12, 10))
# Plot BBMP boundary
bbmp_web.boundary.plot(ax=ax, color='grey', linewidth=0.5)
# Plot local R² values
gdf_proj_web.plot(
column='local_R2',
cmap='viridis',
legend=True,
alpha=0.8,
ax=ax
)
# Add OSM basemap
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
# Style and title
ax.set_title("Local R² from GWR Model", fontsize=14)
ax.axis('off')
# --- Save image ---
output_path = "outputs/gwr_local_R2_map.png"
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"✅ Map saved successfully at: {output_path}")
'''
'\nimport os\nimport contextily as ctx\nimport matplotlib.pyplot as plt\n\n# --- Create output folder if it doesn’t exist ---\nos.makedirs("outputs", exist_ok=True)\n\n# --- Plot Local R² ---\nfig, ax = plt.subplots(figsize=(12, 10))\n\n# Plot BBMP boundary\nbbmp_web.boundary.plot(ax=ax, color=\'grey\', linewidth=0.5)\n\n# Plot local R² values\ngdf_proj_web.plot(\n column=\'local_R2\',\n cmap=\'viridis\',\n legend=True,\n alpha=0.8,\n ax=ax\n)\n\n# Add OSM basemap\nctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)\n\n# Style and title\nax.set_title("Local R² from GWR Model", fontsize=14)\nax.axis(\'off\')\n\n# --- Save image ---\noutput_path = "outputs/gwr_local_R2_map.png"\nplt.savefig(output_path, dpi=300, bbox_inches=\'tight\')\nplt.close()\n\nprint(f"✅ Map saved successfully at: {output_path}")\n'
🧮 For comparison — the `distance_to_park` effect across the spatial models:

| Model | Coefficient for distance_to_park | % Change (per 100 m) | Interpretation |
|---|---|---|---|
| OLS | −0.00038 | −3.73 % | Baseline global effect |
| SEM | −0.00036 | −3.53 % | After controlling for spatial error |
| SLM | −0.00022 | −2.18 % | After including spatial lag (spillover dampens the direct effect) |
| GWR (mean) | roughly −0.000 (varies spatially) | typically between −1 % and −4 %, depending on local context | Spatially varying effects |

🧭 Summary Interpretation
The “distance to park” effect is negative and significant in every model → confirms strong amenity value of green proximity.
The effect magnitude shrinks in spatial models, especially SLM, because part of the price premium is spatially autocorrelated (captured by ρ = 0.52).
GWR shows this effect is not uniform — in some neighborhoods, the premium of being near a park can be much stronger (4–5 %), while in others negligible or slightly reversed.
# Record the interpreter version used for this analysis (reproducibility note).
from platform import python_version

print(python_version())
3.12.7